Exercise 1. Loading the dataset (raw_data)

raw_data<-read.csv("../output/raw_counts.csv.gz")

Exercise 4.Make a histogram of counts for each of the samples Conclusion: the data is not normally distributed

library(tidyr)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#change data into long format
raw_data_long_format <- raw_data %>% 
  pivot_longer(c(-geneID), names_to="sample", values_to="count")

#Make a histogram
ggplot(raw_data_long_format, aes(x = log2(count))) +
  geom_histogram(bins = 10, fill = "blue", color = "black") +
  theme_minimal() +
  labs(title = "Histogram of log2(Counts)", x = "Count", y = "Frequency")
## Warning: Removed 3192188 rows containing non-finite outside the scale range
## (`stat_bin()`).

raw_data <- raw_data[rowSums(raw_data[,-1] > 10) >= 3,]#For our subsequent analyses we want to reduce the data set to only those genes with some expression. In this case we will retain genes with > 10 reads in >= 3 samples

Exercise 5.check for correlation and visualize it

library(tidyr)
library(dplyr)
library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(viridis)
## Loading required package: viridisLite
library(stringr)

raw_data_limited <- raw_data[1:1000, -which(colnames(raw_data) == "geneID")]
# Adjust margins: c(bottom, left, top, right)
par(mar=c(4, 4, 2, 2))
#pairs(raw_data_limited) Large gra
raw_data_cor_table <- cor(raw_data_limited, use = "complete.obs")

raw_data_cor_table[1:5, 1:5]
##           bhr.02_i0 bhr.02_i1 bhr.02_i2 bhr.02_v0 bhr.02_v1
## bhr.02_i0 1.0000000 0.9716989 0.8871227 0.8928627 0.9239195
## bhr.02_i1 0.9716989 1.0000000 0.9547929 0.7934465 0.8353361
## bhr.02_i2 0.8871227 0.9547929 1.0000000 0.6127897 0.6708893
## bhr.02_v0 0.8928627 0.7934465 0.6127897 1.0000000 0.9934911
## bhr.02_v1 0.9239195 0.8353361 0.6708893 0.9934911 1.0000000
raw_data_cor_table %>% gplots::heatmap.2(dendrogram="row", trace = "none", col=viridis::viridis(25, begin=.25), margins=c(7,8))

Exercise 6.Data Normalization

sample.description_raw_data <-tibble(sample=colnames(raw_data)[-1])

data <- sample.description_raw_data%>% mutate(
    Pop = str_extract(sample,"bhr|ihl|kc2|lv3|sq3|tm2|wl2|yo1"), Vern = str_extract(sample, "v0|v1|v2|i0|i1|i2"),
    group = str_c(Vern, Pop, sep = "_"))
print(head(data))
## # A tibble: 6 × 4
##   sample    Pop   Vern  group 
##   <chr>     <chr> <chr> <chr> 
## 1 bhr.02_i0 bhr   i0    i0_bhr
## 2 bhr.02_i1 bhr   i1    i1_bhr
## 3 bhr.02_i2 bhr   i2    i2_bhr
## 4 bhr.02_v0 bhr   v0    v0_bhr
## 5 bhr.02_v1 bhr   v1    v1_bhr
## 6 bhr.02_v2 bhr   v2    v2_bhr
#convert Elevation and Vernalization into factors for testing No vern,short vern, and long vern

sample.description_data <- data%>%
  mutate(Pop=factor(Pop), 
         Vern=factor(Vern,levels = c("v0","v1", "v2","i0","i1","i2"))) # setting the levels in this way makes "v0" the reference  

sample.description_data

Exercise 6. Normalization factor and plot

#calculate the effective library size and normalization factors using the TMM method
library(edgeR)
## Loading required package: limma
counts.matrix_raw_data <- raw_data %>% select(-geneID) %>% as.matrix()
rownames(counts.matrix_raw_data) <- raw_data$geneID

dge.data_Normalization <- DGEList(counts=counts.matrix_raw_data, 
                    group=sample.description_raw_data$group)
## Warning: Unknown or uninitialised column: `group`.
dim(dge.data_Normalization) 
## [1] 29983   206
dge.data_Normalization <- calcNormFactors(dge.data_Normalization, method = "TMM")
dge.data_Normalization$sample # look at the normalization factors
mds <- plotMDS(dge.data_Normalization, top = 5000, plot = FALSE, gene.selection = "common") 

mds <- cbind(sample.description_data, dim1=mds@.Data[[9]], dim2=mds@.Data[[10]])

mds %>% ggplot(aes(x=dim1, y=dim2, color=Pop, pch=Vern)) +
  geom_point() +
  ggtitle("MDS plot, 5000 most variable genes")

## make sure sample order is the same in the dge object and in the sample sheet
all(sample.description_data$sample==rownames(dge.data_Normalization$samples) )
## [1] TRUE

subset bhr

dge.data_Normalization.bhr <- dge.data_Normalization[,sample.description_data$Pop=="bhr"]
dge.data_Normalization.bhr$samples

Exercise 8

data_Normalization<-cpm(dge.data_Normalization)
data_Normalization_log <- log2(data_Normalization[,-1]+1)

design <- model.matrix(~Pop+Vern,data = sample.description_data)
rownames(design) <- sample.description_data$sample

head(design)
##           (Intercept) Popihl Popkc2 Poplv3 Popsq3 Poptm2 Popwl2 Popyo1 Vernv1
## bhr.02_i0           1      0      0      0      0      0      0      0      0
## bhr.02_i1           1      0      0      0      0      0      0      0      0
## bhr.02_i2           1      0      0      0      0      0      0      0      0
## bhr.02_v0           1      0      0      0      0      0      0      0      0
## bhr.02_v1           1      0      0      0      0      0      0      0      1
## bhr.02_v2           1      0      0      0      0      0      0      0      0
##           Vernv2 Verni0 Verni1 Verni2
## bhr.02_i0      0      1      0      0
## bhr.02_i1      0      0      1      0
## bhr.02_i2      0      0      0      1
## bhr.02_v0      0      0      0      0
## bhr.02_v1      0      0      0      0
## bhr.02_v2      1      0      0      0
dge.data_Normalization_dispersions <- estimateGLMCommonDisp(dge.data_Normalization,design,verbose = TRUE)
## Disp = 1.02231 , BCV = 1.0111
#dge.data_Normalization_dispersions <- estimateGLMTrendedDisp(dge.data_Normalization_dispersions,design)
dge.data_Normalization_dispersions <-estimateGLMTagwiseDisp(dge.data_Normalization_dispersions,design)
plotBCV(dge.data_Normalization_dispersions) 

Transcriptional

library(tibble)
library(ggplot2)
library(dplyr)

plotDE_line <- function(genes, dge.data_Normalization_dispersions, sample.description_raw_data) {
  tmp.data <- cpm(dge.data_Normalization_dispersions[genes,], log = TRUE, prior.count = 1)
  tmp.data <- as.data.frame(t(tmp.data))
  colnames(tmp.data) <- genes
  tmp.data <- tmp.data %>%
    rownames_to_column("sample") %>%
    left_join(sample.description_raw_data, by = "sample") %>%
    pivot_longer(cols = genes, names_to = "gene", values_to = "log2_cpm")
# creating a average for each range
  avg.data <- tmp.data %>%
    group_by(Vern, Pop, gene) %>%
    summarise(
      mean_log2_cpm = mean(log2_cpm),
      se = sd(log2_cpm) / sqrt(n()),  # Calculate standard error
      .groups = 'drop'
    )
  
  # delete the "#" below if you want to see errorbar
   pl <- ggplot(avg.data, aes(x = Vern, y = mean_log2_cpm, color = Pop, group = Pop, pch = Pop)) +
    geom_point() +  # Plots the averaged points
    geom_line() +  # Connects the points with a line
    #geom_errorbar(aes(ymin = mean_log2_cpm - se, ymax = mean_log2_cpm + se), width = 0.2) + 
    facet_wrap(~ gene) +
    ylab("Average log2(cpm)") +
    xlab("Genotype") +
     scale_color_viridis_d() 
  return(pl)
}
#FLC gene
plotDE_line("Sdiv_ptg000009l_0614-R",dge.data_Normalization_dispersions,sample.description_data)
## Warning: Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
##   # Was:
##   data %>% select(genes)
## 
##   # Now:
##   data %>% select(all_of(genes))
## 
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 8 values. Consider specifying shapes manually if you need
##   that many have them.
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).

plotDE_line("Sdiv_ptg000010l_2315-R",dge.data_Normalization_dispersions,sample.description_data)
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 8 values. Consider specifying shapes manually if you need
##   that many have them.
## Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).

#plotDE_boxplot("Sdiv_ptg000009l_0614-R",dge.data_Normalization_dispersions,sample.description_data)
#plotDE_boxplot("Sdiv_ptg000010l_2315-R",dge.data_Normalization_dispersions,sample.description_data)

t -test for v2 from plotDE_line_Sdiv_ptg000009l_0614

H0: (meanV2) = (meanNotV2) HA: (meanV2) ≠ (meanNotV2)

Where (meanV2) is the mean log2(cpm) for the group where Vern = V2, and (meanNotV2)is the mean log2(cpm) for all other groups combined.

P value is 0.006145, reject H0 suggesting there is a statistical significant on the V2 group on average, while using alpha = 0.05

#Average gene and t test for FLC Sdiv_ptg000009l_0614
tmp.data_Sdiv_ptg000009l_0614 <- cpm(dge.data_Normalization_dispersions["Sdiv_ptg000009l_0614-R",], log = TRUE, prior.count = 1)
tmp.data_Sdiv_ptg000009l_0614 <- as.data.frame(t(tmp.data_Sdiv_ptg000009l_0614))
colnames(tmp.data_Sdiv_ptg000009l_0614) <- "Sdiv_ptg000009l_0614-R"
tmp.data_Sdiv_ptg000009l_0614 <- tmp.data_Sdiv_ptg000009l_0614 %>%
    rownames_to_column("sample") %>%
    left_join(sample.description_raw_data, by = "sample") %>%
    pivot_longer(cols = "Sdiv_ptg000009l_0614-R", names_to = "gene", values_to = "log2_cpm")
tmp.data_Sdiv_ptg000009l_0614 <- left_join(tmp.data_Sdiv_ptg000009l_0614, data, by = "sample")


avg_log2_cpm_Sdiv_ptg000009l_0614 <- tmp.data_Sdiv_ptg000009l_0614 %>%
  group_by(Pop, Vern) %>%
  summarise(
    avg_log2_cpm = mean(log2_cpm, na.rm = TRUE),  # Calculate average, ignoring NA values
    .groups = 'drop'  # Drop the grouping structure after summarization
  )

print(avg_log2_cpm_Sdiv_ptg000009l_0614)
## # A tibble: 48 × 3
##    Pop   Vern  avg_log2_cpm
##    <chr> <chr>        <dbl>
##  1 bhr   i0            5.51
##  2 bhr   i1            5.49
##  3 bhr   i2            4.69
##  4 bhr   v0            5.28
##  5 bhr   v1            4.40
##  6 bhr   v2            2.76
##  7 ihl   i0            5.37
##  8 ihl   i1            4.43
##  9 ihl   i2            2.40
## 10 ihl   v0            5.22
## # ℹ 38 more rows
# T-Test
# Create two groups
group_V2_Sdiv_ptg000009l_0614 <- avg_log2_cpm_Sdiv_ptg000009l_0614$avg_log2_cpm[avg_log2_cpm_Sdiv_ptg000009l_0614$Vern == "v2"]


group_not_V2_Sdiv_ptg000009l_0614 <- avg_log2_cpm_Sdiv_ptg000009l_0614$avg_log2_cpm[avg_log2_cpm_Sdiv_ptg000009l_0614$Vern != "v2"]


t_test_result_Sdiv_ptg000009l_0614 <- t.test(group_V2_Sdiv_ptg000009l_0614, group_not_V2_Sdiv_ptg000009l_0614, alternative = "two.sided", var.equal = FALSE)

print(t_test_result_Sdiv_ptg000009l_0614)
## 
##  Welch Two Sample t-test
## 
## data:  group_V2_Sdiv_ptg000009l_0614 and group_not_V2_Sdiv_ptg000009l_0614
## t = -3.6098, df = 8.5636, p-value = 0.006145
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -4.518563 -1.020362
## sample estimates:
## mean of x mean of y 
##  1.463795  4.233258

t -test for v2 from plotDE_line_Sdiv_ptg000010l_2315

H0: (meanV2) = (meanNotV2) HA: (meanV2) ≠ (meanNotV2)

Where (meanV2) is the mean log2(cpm) for the group where Vern = V2, and (meanNotV2)is the mean log2(cpm) for all other groups combined.

The P value is 0.07004, therefore, we fail to reject H0 suggesting V2 group is not statistical significant than other groups, while using alpha = 0.05

#Average gene and t test for FLC Sdiv_ptg000010l_2315-R


tmp.data_Sdiv_ptg000010l_2315 <- cpm(dge.data_Normalization_dispersions["Sdiv_ptg000010l_2315-R",], log = TRUE, prior.count = 1)
tmp.data_Sdiv_ptg000010l_2315 <- as.data.frame(t(tmp.data_Sdiv_ptg000010l_2315))
colnames(tmp.data_Sdiv_ptg000010l_2315) <- "Sdiv_ptg000010l_2315-R"
tmp.data_Sdiv_ptg000010l_2315 <- tmp.data_Sdiv_ptg000010l_2315 %>%
    rownames_to_column("sample") %>%
    left_join(sample.description_raw_data, by = "sample") %>%
    pivot_longer(cols = "Sdiv_ptg000010l_2315-R", names_to = "gene", values_to = "log2_cpm")
tmp.data_Sdiv_ptg000010l_2315 <- left_join(tmp.data_Sdiv_ptg000010l_2315, data, by = "sample")


avg_log2_cpm_Sdiv_ptg000010l_2315 <- tmp.data_Sdiv_ptg000010l_2315 %>%
  group_by(Pop, Vern) %>%
  summarise(
    avg_log2_cpm = mean(log2_cpm, na.rm = TRUE),  # Calculate average, ignoring NA values
    .groups = 'drop'  # Drop the grouping structure after summarization
  )

print(avg_log2_cpm_Sdiv_ptg000010l_2315)
## # A tibble: 48 × 3
##    Pop   Vern  avg_log2_cpm
##    <chr> <chr>        <dbl>
##  1 bhr   i0            4.72
##  2 bhr   i1            4.77
##  3 bhr   i2            4.35
##  4 bhr   v0            3.92
##  5 bhr   v1            4.16
##  6 bhr   v2            3.68
##  7 ihl   i0            4.62
##  8 ihl   i1            4.68
##  9 ihl   i2            2.78
## 10 ihl   v0            3.80
## # ℹ 38 more rows
# T-Test
# Create two groups
group_V2_Sdiv_ptg000010l_2315 <- avg_log2_cpm_Sdiv_ptg000010l_2315$avg_log2_cpm[avg_log2_cpm_Sdiv_ptg000010l_2315$Vern == "v2"]


group_not_V2_Sdiv_ptg000010l_2315 <- avg_log2_cpm_Sdiv_ptg000010l_2315$avg_log2_cpm[avg_log2_cpm_Sdiv_ptg000010l_2315$Vern != "v2"]


t_test_result_Sdiv_ptg000010l_2315 <- t.test(group_V2_Sdiv_ptg000010l_2315, group_not_V2_Sdiv_ptg000010l_2315, alternative = "two.sided", var.equal = FALSE)

print(t_test_result_Sdiv_ptg000010l_2315)
## 
##  Welch Two Sample t-test
## 
## data:  group_V2_Sdiv_ptg000010l_2315 and group_not_V2_Sdiv_ptg000010l_2315
## t = -2.0446, df = 9.3574, p-value = 0.07004
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -2.3537675  0.1120589
## sample estimates:
## mean of x mean of y 
##  2.737856  3.858710

V2 FLC Different Elevation Testing Sdiv_ptg000009l_0614

library(dplyr)
library(ggplot2)

tmp.data_Sdiv_ptg000009l_0614 <- tmp.data_Sdiv_ptg000009l_0614 %>%
  mutate(elevation = ifelse(Pop %in% c("tm", "ihl", "bhr", "kc"), "low", 
                            ifelse(Pop %in% c("lv", "wl", "yo", "sq"), "high", NA)))

V2_tmp.data_Sdiv_ptg000009l_0614 <- tmp.data_Sdiv_ptg000009l_0614 %>%
  filter(Vern == "v2")


ggplot(V2_tmp.data_Sdiv_ptg000009l_0614, aes(x=elevation, y=log2_cpm)) +
  geom_boxplot() +
  labs(title="Boxplot of FLC level by Elevation and Vern",
       x="Elevation",
       y="Log2 CPM",
       fill="Vern Group") +
  theme_minimal()

V2 FLC Different Elevation Testing Sdiv_ptg000010l_2315

library(dplyr)
library(ggplot2)

tmp.data_Sdiv_ptg000010l_2315 <- tmp.data_Sdiv_ptg000010l_2315 %>%
  mutate(elevation = ifelse(Pop %in% c("tm", "ihl", "bhr", "kc"), "low", 
                            ifelse(Pop %in% c("lv", "wl", "yo", "sq"), "high", NA)))

V2_tmp.data_Sdiv_ptg000010l_2315 <- tmp.data_Sdiv_ptg000010l_2315 %>%
  filter(Vern == "v2")


ggplot(V2_tmp.data_Sdiv_ptg000010l_2315, aes(x=elevation, y=log2_cpm)) +
  geom_boxplot() +
  labs(title="Boxplot of FLC level by Elevation and Vern",
       x="Elevation",
       y="Log2 CPM",
       fill="Vern Group") +
  theme_minimal()

Function for Separate graph Sdiv_ptg000010l_2315

library(patchwork)

#V0I0 group
V0I0_tmp.data_Sdiv_ptg000010l_2315 <- tmp.data_Sdiv_ptg000010l_2315 %>%
  filter(Vern == "v0" | Vern == "i0")

V0I0_avg_log2_cpm_Sdiv_ptg000010l_2315 <- V0I0_tmp.data_Sdiv_ptg000010l_2315 %>%
  group_by(Pop, Vern) %>%
  summarise(
    avg_log2_cpm = mean(log2_cpm, na.rm = TRUE),  # Calculate average, ignoring NA values
    .groups = 'drop'  # Drop the grouping structure after summarization
  )

# Manually set the factor levels to control order
V0I0_avg_log2_cpm_Sdiv_ptg000010l_2315$Vern <- factor(V0I0_avg_log2_cpm_Sdiv_ptg000010l_2315$Vern, 
                                                      levels = c("v0", "i0"))


pl2315_1 <- ggplot(V0I0_avg_log2_cpm_Sdiv_ptg000010l_2315, aes(x = Vern, y = avg_log2_cpm, color = Pop, group = Pop, shape = Pop)) +
    geom_point() +  # Plots the averaged points
    geom_line() +  # Connects the points within each group with lines
    labs(title = "Sdiv_ptg000010l_2315",
      x = "Treatment",
      y = "Mean log2(cpm) Expression"
    ) +
    scale_color_viridis_d()  # Apply the viridis discrete color scale for aesthetic mapping


#V1I1 group
V1I1_tmp.data_Sdiv_ptg000010l_2315 <- tmp.data_Sdiv_ptg000010l_2315 %>%
  filter(Vern == "v0" | Vern == "i1"| Vern == "v1")

V1I1_avg_log2_cpm_Sdiv_ptg000010l_2315 <- V1I1_tmp.data_Sdiv_ptg000010l_2315 %>%
  group_by(Pop, Vern) %>%
  summarise(
    avg_log2_cpm = mean(log2_cpm, na.rm = TRUE),  # Calculate average, ignoring NA values
    .groups = 'drop'  # Drop the grouping structure after summarization
  )

# Manually set the factor levels to control order
V1I1_avg_log2_cpm_Sdiv_ptg000010l_2315$Vern <- factor(V1I1_avg_log2_cpm_Sdiv_ptg000010l_2315$Vern, 
                                                      levels = c("v0", "v1", "i1"))

pl2315_2 <- ggplot(V1I1_avg_log2_cpm_Sdiv_ptg000010l_2315, aes(x = Vern, y = avg_log2_cpm, color = Pop, group = Pop, shape = Pop)) +
    geom_point() +  # Plots the averaged points with different shapes per group
    geom_line() +  # Connects the points within each group with lines
    labs(title = "Sdiv_ptg000010l_2315",
      x = "Treatment",
      y = "Mean_log2_cpm Expression",
    ) +
    scale_color_viridis_d()  # Apply the viridis discrete color scale for aesthetic mapping

#V2I2 group
V2I2_tmp.data_Sdiv_ptg000010l_2315 <- tmp.data_Sdiv_ptg000010l_2315 %>%
  filter(Vern == "v0" | Vern == "i2"| Vern == "v2")

V2I2_avg_log2_cpm_Sdiv_ptg000010l_2315 <- V2I2_tmp.data_Sdiv_ptg000010l_2315 %>%
  group_by(Pop, Vern) %>%
  summarise(
    avg_log2_cpm = mean(log2_cpm, na.rm = TRUE),  # Calculate average, ignoring NA values
    .groups = 'drop'  # Drop the grouping structure after summarization
  )

# Manually set the factor levels to control order
V2I2_avg_log2_cpm_Sdiv_ptg000010l_2315$Vern <- factor(V2I2_avg_log2_cpm_Sdiv_ptg000010l_2315$Vern, 
                                                      levels = c("v0", "v2", "i2"))

pl2315_3 <- ggplot(V2I2_avg_log2_cpm_Sdiv_ptg000010l_2315, aes(x = Vern, y = avg_log2_cpm, color = Pop, group = Pop, shape = Pop)) +
    geom_point() +  # Plots the averaged points with different shapes per group
    geom_line() +  # Connects the points within each group with lines
    labs(title = "Sdiv_ptg000010l_2315",
      x = "Treatment",
      y = "Mean_log2_cpm Expression",
    ) +
    scale_color_viridis_d()  # Apply the viridis discrete color scale for aesthetic mapping

pl2315_1 / pl2315_2 / pl2315_3 + plot_layout(guides = "collect", axis_titles="collect")
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 8 values. Consider specifying shapes manually if you need
##   that many have them.
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 8 values. Consider specifying shapes manually if you need
##   that many have them.
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 8 values. Consider specifying shapes manually if you need
##   that many have them.
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_point()`).

Function for Separate graph Sdiv_ptg000009l_0614

#V0I0 group
V0I0_tmp.data_Sdiv_ptg000009l_0614 <- tmp.data_Sdiv_ptg000009l_0614 %>%
  filter(Vern == "v0" | Vern == "i0")

V0I0_avg_log2_cpm_Sdiv_ptg000009l_0614 <- V0I0_tmp.data_Sdiv_ptg000009l_0614 %>%
  group_by(Pop, Vern) %>%
  summarise(
    avg_log2_cpm = mean(log2_cpm, na.rm = TRUE),  # Calculate average, ignoring NA values
    .groups = 'drop'  # Drop the grouping structure after summarization
  )

# Manually set the factor levels to control order
V0I0_avg_log2_cpm_Sdiv_ptg000009l_0614$Vern <- factor(V0I0_avg_log2_cpm_Sdiv_ptg000009l_0614$Vern, 
                                                      levels = c("v0", "i0"))


pl614_1 <- ggplot(V0I0_avg_log2_cpm_Sdiv_ptg000009l_0614, aes(x = Vern, y = avg_log2_cpm, color = Pop, group = Pop, shape = Pop)) +
    geom_point() +  # Plots the averaged points
    geom_line() +  # Connects the points within each group with lines
    labs(
      x = "Treatment",
      y = "Mean log2(cpm) Expression"
    ) +
    scale_color_viridis_d()  # Apply the viridis discrete color scale for aesthetic mapping


#V1I1 group
V1I1_tmp.data_Sdiv_ptg000009l_0614 <- tmp.data_Sdiv_ptg000009l_0614 %>%
  filter(Vern == "v0" | Vern == "i1"| Vern == "v1")

V1I1_avg_log2_cpm_Sdiv_ptg000009l_0614 <- V1I1_tmp.data_Sdiv_ptg000009l_0614 %>%
  group_by(Pop, Vern) %>%
  summarise(
    avg_log2_cpm = mean(log2_cpm, na.rm = TRUE),  # Calculate average, ignoring NA values
    .groups = 'drop'  # Drop the grouping structure after summarization
  )

# Manually set the factor levels to control order
V1I1_avg_log2_cpm_Sdiv_ptg000009l_0614$Vern <- factor(V1I1_avg_log2_cpm_Sdiv_ptg000009l_0614$Vern, 
                                                      levels = c("v0", "v1", "i1"))

pl614_2 <- ggplot(V1I1_avg_log2_cpm_Sdiv_ptg000009l_0614, aes(x = Vern, y = avg_log2_cpm, color = Pop, group = Pop, shape = Pop)) +
    geom_point() +  # Plots the averaged points with different shapes per group
    geom_line() +  # Connects the points within each group with lines
    labs(
      x = "Treatment",
      y = "Mean_log2_cpm Expression",
    ) +
    scale_color_viridis_d()  # Apply the viridis discrete color scale for aesthetic mapping

#V2I2 group
V2I2_tmp.data_Sdiv_ptg000009l_0614 <- tmp.data_Sdiv_ptg000009l_0614 %>%
  filter(Vern == "v0" | Vern == "i2"| Vern == "v2")

V2I2_avg_log2_cpm_Sdiv_ptg000009l_0614 <- V2I2_tmp.data_Sdiv_ptg000009l_0614 %>%
  group_by(Pop, Vern) %>%
  summarise(
    avg_log2_cpm = mean(log2_cpm, na.rm = TRUE),  # Calculate average, ignoring NA values
    .groups = 'drop'  # Drop the grouping structure after summarization
  )

# Manually set the factor levels to control order
V2I2_avg_log2_cpm_Sdiv_ptg000009l_0614$Vern <- factor(V2I2_avg_log2_cpm_Sdiv_ptg000009l_0614$Vern, 
                                                      levels = c("v0", "v2", "i2"))

pl614_3 <- ggplot(V2I2_avg_log2_cpm_Sdiv_ptg000009l_0614, aes(x = Vern, y = avg_log2_cpm, color = Pop, group = Pop, shape = Pop)) +
    geom_point() +  # Plots the averaged points with different shapes per group
    geom_line() +  # Connects the points within each group with lines
    labs(
      x = "Treatment",
      y = "Mean_log2_cpm Expression",
    ) +
    scale_color_viridis_d()  # Apply the viridis discrete color scale for aesthetic mapping

pl614_1 / pl614_2 / pl614_3 + plot_layout(guides = "collect", axis_titles="collect")
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 8 values. Consider specifying shapes manually if you need
##   that many have them.
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 8 values. Consider specifying shapes manually if you need
##   that many have them.
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 8 values. Consider specifying shapes manually if you need
##   that many have them.
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_point()`).

Functions for separate graphs

library(dplyr)
library(ggplot2)
library(patchwork)


plot_gene_average_expression <- function(data, gene_id, vern_levels, pop_levels) {
  # Filtering and summarizing data
  summarized_data <- data %>%
    filter(Vern %in% vern_levels) %>%
    group_by(Pop, Vern) %>%
    summarise(avg_log2_cpm = mean(log2_cpm, na.rm = TRUE), .groups = 'drop') %>%
    mutate(Vern = factor(Vern, levels = vern_levels))
  

  # Create the plot
  plot <- ggplot(summarized_data, aes(x = Vern, y = avg_log2_cpm, color = Pop, group = Pop, shape = Pop)) +
    geom_point() +  
    geom_line() +  
    labs(
      title = paste("Average expression", gene_id),
      x = "Treatment",
      y = "Mean log2(cpm) Expression"
    ) +
    scale_color_viridis_d()  

  return(plot)
}

graphs using functions

# Example "Sdiv_ptg000009l_0614"

plot1_Sdiv_ptg000009l_0614 <- plot_gene_average_expression(tmp.data_Sdiv_ptg000009l_0614, "Sdiv_ptg000009l_0614", c("v0", "i0"), unique(tmp.data_Sdiv_ptg000009l_0614$Pop))
plot2_Sdiv_ptg000009l_0614 <- plot_gene_average_expression(tmp.data_Sdiv_ptg000009l_0614, "Sdiv_ptg000009l_0614", c("v0", "v1", "i1"), unique(tmp.data_Sdiv_ptg000009l_0614$Pop))
plot3_Sdiv_ptg000009l_0614 <- plot_gene_average_expression(tmp.data_Sdiv_ptg000009l_0614, "Sdiv_ptg000009l_0614", c("v0", "v2", "i2"), unique(tmp.data_Sdiv_ptg000009l_0614$Pop))

final_plot_Sdiv_ptg000009l_0614 <- plot1_Sdiv_ptg000009l_0614 / plot2_Sdiv_ptg000009l_0614 / plot3_Sdiv_ptg000009l_0614 + plot_layout(guides = "collect") + plot_annotation(title = " Gene Sdiv_ptg000009l_0614")

print(final_plot_Sdiv_ptg000009l_0614)
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 8 values. Consider specifying shapes manually if you need
##   that many have them.
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 8 values. Consider specifying shapes manually if you need
##   that many have them.
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 8 values. Consider specifying shapes manually if you need
##   that many have them.
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Example Sdiv_ptg000010l_2315

plot1_Sdiv_ptg000010l_2315 <- plot_gene_average_expression(tmp.data_Sdiv_ptg000010l_2315, "Sdiv_ptg000010l_2315", c("v0", "i0"), unique(tmp.data_Sdiv_ptg000010l_2315$Pop))
plot2_Sdiv_ptg000010l_2315 <- plot_gene_average_expression(tmp.data_Sdiv_ptg000010l_2315, "Sdiv_ptg000010l_2315", c("v0", "v1", "i1"), unique(tmp.data_Sdiv_ptg000010l_2315$Pop))
plot3_Sdiv_ptg000010l_2315 <- plot_gene_average_expression(tmp.data_Sdiv_ptg000010l_2315, "Sdiv_ptg000010l_2315", c("v0", "v2", "i2"), unique(tmp.data_Sdiv_ptg000010l_2315$Pop))

final_plot_Sdiv_ptg000010l_2315 <- plot1_Sdiv_ptg000010l_2315 / plot2_Sdiv_ptg000010l_2315 / plot3_Sdiv_ptg000010l_2315 + plot_layout(guides = "collect") + plot_annotation(title = " Gene Sdiv_ptg000010l_2315")

print(final_plot_Sdiv_ptg000010l_2315)
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 8 values. Consider specifying shapes manually if you need
##   that many have them.
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 8 values. Consider specifying shapes manually if you need
##   that many have them.
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: The shape palette can deal with a maximum of 6 discrete values because more
## than 6 becomes difficult to discriminate
## ℹ you have requested 8 values. Consider specifying shapes manually if you need
##   that many have them.
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_point()`).

subset bhr single comparsion v0 vs v1

dge.data_Normalization.bhr <- dge.data_Normalization [,sample.description_data $Pop=="bhr"]
dge.data_Normalization.bhr$samples
sample.description_data.bhr <- sample.description_data %>%
  filter(Pop == "bhr")


design.bhr <- model.matrix(~Vern,data = sample.description_data.bhr)
rownames(design.bhr) <- sample.description_data.bhr$samples
## Warning: Unknown or uninitialised column: `samples`.
design.bhr
##       (Intercept) Vernv1 Vernv2 Verni0 Verni1 Verni2
##  [1,]           1      0      0      1      0      0
##  [2,]           1      0      0      0      1      0
##  [3,]           1      0      0      0      0      1
##  [4,]           1      0      0      0      0      0
##  [5,]           1      1      0      0      0      0
##  [6,]           1      0      1      0      0      0
##  [7,]           1      0      0      1      0      0
##  [8,]           1      0      0      0      1      0
##  [9,]           1      0      0      0      0      1
## [10,]           1      0      0      0      0      0
## [11,]           1      1      0      0      0      0
## [12,]           1      0      1      0      0      0
## [13,]           1      0      0      1      0      0
## [14,]           1      0      0      0      1      0
## [15,]           1      0      0      0      0      1
## [16,]           1      0      0      0      0      0
## [17,]           1      1      0      0      0      0
## [18,]           1      0      1      0      0      0
## [19,]           1      0      0      1      0      0
## [20,]           1      0      0      0      1      0
## [21,]           1      0      0      0      0      1
## [22,]           1      0      0      0      0      0
## [23,]           1      1      0      0      0      0
## [24,]           1      0      1      0      0      0
## [25,]           1      0      0      1      0      0
## [26,]           1      0      0      0      1      0
## [27,]           1      0      0      0      0      1
## [28,]           1      0      0      0      0      0
## [29,]           1      1      0      0      0      0
## [30,]           1      0      1      0      0      0
## attr(,"assign")
## [1] 0 1 1 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$Vern
## [1] "contr.treatment"
dge.data_Normalization_dispersions.bhr <- estimateGLMCommonDisp(dge.data_Normalization.bhr,design.bhr,verbose = TRUE)
## Disp = 0.29412 , BCV = 0.5423
fit.bhr <- glmFit(dge.data_Normalization_dispersions.bhr,design.bhr)
gt.lrt.bhr <- glmLRT(fit.bhr,coef="Vernv1")
result_v0v1_bhr<-summary(decideTests(gt.lrt.bhr,p=0.01))
print(result_v0v1_bhr)
##        Vernv1
## Down      298
## NotSig  29088
## Up        597

subset yo single comparsion

dge.data_Normalization.yo1 <- dge.data_Normalization [,sample.description_data $Pop=="yo1"]
dge.data_Normalization.yo1$samples
sample.description_data.yo1 <- sample.description_data %>%
  filter(Pop == "yo1")
sample.description_data.yo1
design.yo1 <- model.matrix(~Vern,data = sample.description_data.yo1)
rownames(design.yo1) <- sample.description_data.yo1$samples
## Warning: Unknown or uninitialised column: `samples`.
design.yo1
##       (Intercept) Vernv1 Vernv2 Verni0 Verni1 Verni2
##  [1,]           1      0      0      1      0      0
##  [2,]           1      0      0      0      1      0
##  [3,]           1      0      0      0      0      1
##  [4,]           1      0      0      0      0      0
##  [5,]           1      1      0      0      0      0
##  [6,]           1      0      1      0      0      0
##  [7,]           1      0      0      1      0      0
##  [8,]           1      0      0      0      1      0
##  [9,]           1      0      0      0      0      0
## [10,]           1      1      0      0      0      0
## [11,]           1      0      1      0      0      0
## [12,]           1      0      0      1      0      0
## [13,]           1      0      0      0      0      1
## [14,]           1      0      0      0      0      0
## [15,]           1      1      0      0      0      0
## [16,]           1      0      1      0      0      0
## [17,]           1      0      0      1      0      0
## [18,]           1      0      0      0      0      1
## [19,]           1      0      0      0      0      0
## [20,]           1      1      0      0      0      0
## [21,]           1      0      1      0      0      0
## [22,]           1      0      0      1      0      0
## [23,]           1      0      0      0      1      0
## [24,]           1      0      0      0      0      1
## [25,]           1      0      0      0      0      0
## [26,]           1      1      0      0      0      0
## [27,]           1      0      1      0      0      0
## attr(,"assign")
## [1] 0 1 1 1 1 1
## attr(,"contrasts")
## attr(,"contrasts")$Vern
## [1] "contr.treatment"
dge.data_Normalization_dispersions.yo1 <- estimateGLMCommonDisp(dge.data_Normalization.yo1,design.yo1,verbose = TRUE)
## Disp = 1.2375 , BCV = 1.1124
fit.yo1 <- glmFit(dge.data_Normalization_dispersions.yo1,design.yo1)
gt.lrt.yo1 <- glmLRT(fit.yo1,coef="Vernv1")
result_v0v1.yo1<-summary(decideTests(gt.lrt.yo1,p=0.01))

print(result_v0v1.yo1)
##        Vernv1
## Down      270
## NotSig  29373
## Up        340

Loop for each population (vo vs v1)

Population Down NotSig Up
bhr 298 29088 597
ihl 203 28977 803
kc 282 28978 723
lv 203 28458 1322
sq 637 29302 44
tm 1521 27924 538
wl 112 29533 338
yo 270 29373 340
pop_groups <- unique(sample.description_data$Pop)
results_list <- list()

for (pop in pop_groups) {

  #Extract the Normalization data for each population
  dge.data_Normalization_pop <- dge.data_Normalization[, sample.description_data$Pop == pop]
  #Extract sample description data for each population
  sample.description_data_pop <- sample.description_data %>% filter(Pop == pop)
  #redesign the matrix with correct # column for each population
  design_pop <- model.matrix(~Vern, data = sample.description_data_pop)
  rownames(design_pop) <- sample.description_data_pop$samples
  #using matrix to find out the Normalization dispersions for each population
  dge.data_Normalization_dispersions_pop <- estimateGLMCommonDisp(dge.data_Normalization_pop, design_pop, verbose = TRUE)
  
  #Fit the model for each population dispersion data
  fit_pop <- glmFit(dge.data_Normalization_dispersions_pop, design_pop)
  # Conduct likelihood ratio test for each population
  gt.lrt_pop <- glmLRT(fit_pop, coef = "Vernv1")
  
  # Summarize the results for each population (up / down regulation)
  result_v0v1_pop <- summary(decideTests(gt.lrt_pop, p = 0.01))
  description <- "v0v1 comparison for population: " 
  results_list[[pop]] <- list(
  Description = paste(description, pop, sep=""),
  Results = result_v0v1_pop)
  }
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 0.29412 , BCV = 0.5423
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 0.15282 , BCV = 0.3909
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 0.41909 , BCV = 0.6474
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 0.78411 , BCV = 0.8855
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 1.80391 , BCV = 1.3431
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 1.24269 , BCV = 1.1148
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 1.04389 , BCV = 1.0217
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 1.2375 , BCV = 1.1124
# Print or return results
print(results_list)
## $bhr
## $bhr$Description
## [1] "v0v1 comparison for population: bhr"
## 
## $bhr$Results
##        Vernv1
## Down      298
## NotSig  29088
## Up        597
## 
## 
## $ihl
## $ihl$Description
## [1] "v0v1 comparison for population: ihl"
## 
## $ihl$Results
##        Vernv1
## Down      203
## NotSig  28977
## Up        803
## 
## 
## $kc2
## $kc2$Description
## [1] "v0v1 comparison for population: kc2"
## 
## $kc2$Results
##        Vernv1
## Down      282
## NotSig  28978
## Up        723
## 
## 
## $lv3
## $lv3$Description
## [1] "v0v1 comparison for population: lv3"
## 
## $lv3$Results
##        Vernv1
## Down      203
## NotSig  28458
## Up       1322
## 
## 
## $sq3
## $sq3$Description
## [1] "v0v1 comparison for population: sq3"
## 
## $sq3$Results
##        Vernv1
## Down      637
## NotSig  29302
## Up         44
## 
## 
## $tm2
## $tm2$Description
## [1] "v0v1 comparison for population: tm2"
## 
## $tm2$Results
##        Vernv1
## Down     1530
## NotSig  27905
## Up        548
## 
## 
## $wl2
## $wl2$Description
## [1] "v0v1 comparison for population: wl2"
## 
## $wl2$Results
##        Vernv1
## Down      112
## NotSig  29533
## Up        338
## 
## 
## $yo1
## $yo1$Description
## [1] "v0v1 comparison for population: yo1"
## 
## $yo1$Results
##        Vernv1
## Down      270
## NotSig  29373
## Up        340

Loop for each population (vo vs v2)

Population Down NotSig Up
bhr 530 28489 964
ihl 368 28630 985
kc 105 29197 681
lv 1081 27785 1117
sq 707 29161 115
tm 35 29708 240
wl 223 29277 483
yo 126 29297 560
pop_groups <- unique(sample.description_data$Pop)
results_list <- list()

for (pop in pop_groups) {

  #Extract the Normalization data for each population
  dge.data_Normalization_pop <- dge.data_Normalization[, sample.description_data$Pop == pop]
  #Extract sample description data for each population
  sample.description_data_pop <- sample.description_data %>% filter(Pop == pop)
  #redesign the matrix with correct # column for each population
  design_pop <- model.matrix(~Vern, data = sample.description_data_pop)
  rownames(design_pop) <- sample.description_data_pop$samples
  #using matrix to find out the Normalization dispersions for each population
  dge.data_Normalization_dispersions_pop <- estimateGLMCommonDisp(dge.data_Normalization_pop, design_pop, verbose = TRUE)
  
  #Fit the model for each population dispersion data
  fit_pop <- glmFit(dge.data_Normalization_dispersions_pop, design_pop)
  # Conduct likelihood ratio test for each population
  gt.lrt_pop <- glmLRT(fit_pop, coef = "Vernv2")
  
  # Summarize the results for each population (up / down regulation)
  result_v0v2_pop <- summary(decideTests(gt.lrt_pop, p = 0.01))
  description <- "v0v2 comparison for population: " 
  results_list[[pop]] <- list(
  Description = paste(description, pop, sep=""),
  Results = result_v0v2_pop)
  }
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 0.29412 , BCV = 0.5423
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 0.15282 , BCV = 0.3909
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 0.41909 , BCV = 0.6474
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 0.78411 , BCV = 0.8855
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 1.80391 , BCV = 1.3431
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 1.24269 , BCV = 1.1148
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 1.04389 , BCV = 1.0217
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 1.2375 , BCV = 1.1124
# Print or return results
print(results_list)
## $bhr
## $bhr$Description
## [1] "v0v2 comparison for population: bhr"
## 
## $bhr$Results
##        Vernv2
## Down      530
## NotSig  28489
## Up        964
## 
## 
## $ihl
## $ihl$Description
## [1] "v0v2 comparison for population: ihl"
## 
## $ihl$Results
##        Vernv2
## Down      368
## NotSig  28630
## Up        985
## 
## 
## $kc2
## $kc2$Description
## [1] "v0v2 comparison for population: kc2"
## 
## $kc2$Results
##        Vernv2
## Down      105
## NotSig  29197
## Up        681
## 
## 
## $lv3
## $lv3$Description
## [1] "v0v2 comparison for population: lv3"
## 
## $lv3$Results
##        Vernv2
## Down     1081
## NotSig  27785
## Up       1117
## 
## 
## $sq3
## $sq3$Description
## [1] "v0v2 comparison for population: sq3"
## 
## $sq3$Results
##        Vernv2
## Down      707
## NotSig  29161
## Up        115
## 
## 
## $tm2
## $tm2$Description
## [1] "v0v2 comparison for population: tm2"
## 
## $tm2$Results
##        Vernv2
## Down       37
## NotSig  29706
## Up        240
## 
## 
## $wl2
## $wl2$Description
## [1] "v0v2 comparison for population: wl2"
## 
## $wl2$Results
##        Vernv2
## Down      223
## NotSig  29277
## Up        483
## 
## 
## $yo1
## $yo1$Description
## [1] "v0v2 comparison for population: yo1"
## 
## $yo1$Results
##        Vernv2
## Down      126
## NotSig  29297
## Up        560

Loop for each population (v1 vs v2)

Population Down NotSig Up
bhr 468 29029 486
ihl 374 29304 305
kc 19 29778 186
lv 1220 28555 208
sq 195 29605 183
tm 0 29940 43
wl 108 29689 186
yo 87 29465 431
  1. create a new reference for v1
sample.description_data_v1_reference <- data%>%
  mutate(Pop=factor(Pop), 
         Vern=factor(Vern,levels = c("v1","v0", "v2","i0","i1","i2"))) # setting the levels in this way makes "v1" the reference  

sample.description_data_v1_reference
  1. loop for comparison
pop_groups <- unique(sample.description_data_v1_reference$Pop)
results_list <- list()

for (pop in pop_groups) {

  #Extract the Normalization data for each population
  dge.data_Normalization_pop <- dge.data_Normalization[, sample.description_data_v1_reference$Pop == pop]
  #Extract sample description data for each population
  sample.description_data_pop <- sample.description_data_v1_reference %>% filter(Pop == pop)
  #redesign the matrix with correct # column for each population
  design_pop <- model.matrix(~Vern, data = sample.description_data_pop)
  rownames(design_pop) <- sample.description_data_pop$samples
  #using matrix to find out the Normalization dispersions for each population
  dge.data_Normalization_dispersions_pop <- estimateGLMCommonDisp(dge.data_Normalization_pop, design_pop, verbose = TRUE)
  
  #Fit the model for each population dispersion data
  fit_pop <- glmFit(dge.data_Normalization_dispersions_pop, design_pop)
  # Conduct likelihood ratio test for each population
  gt.lrt_pop <- glmLRT(fit_pop, coef = "Vernv2")
  # Summarize the results for each population (up / down regulation)
  result_v1v2_pop <- summary(decideTests(gt.lrt_pop, p = 0.01))
  description <- "v1v2 comparison for population: " 
  results_list[[pop]] <- list(
    Description = paste(description, pop, sep=""),
    Results = result_v1v2_pop)
   }
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 0.29412 , BCV = 0.5423
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 0.15282 , BCV = 0.3909
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 0.41909 , BCV = 0.6474
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 0.78411 , BCV = 0.8855
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 1.80391 , BCV = 1.3431
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 1.24269 , BCV = 1.1148
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 1.04389 , BCV = 1.0217
## Warning: Unknown or uninitialised column: `samples`.
## Disp = 1.2375 , BCV = 1.1124
# Print or return results
print(results_list)
## $bhr
## $bhr$Description
## [1] "v1v2 comparison for population: bhr"
## 
## $bhr$Results
##        Vernv2
## Down      468
## NotSig  29029
## Up        486
## 
## 
## $ihl
## $ihl$Description
## [1] "v1v2 comparison for population: ihl"
## 
## $ihl$Results
##        Vernv2
## Down      374
## NotSig  29304
## Up        305
## 
## 
## $kc2
## $kc2$Description
## [1] "v1v2 comparison for population: kc2"
## 
## $kc2$Results
##        Vernv2
## Down       19
## NotSig  29778
## Up        186
## 
## 
## $lv3
## $lv3$Description
## [1] "v1v2 comparison for population: lv3"
## 
## $lv3$Results
##        Vernv2
## Down     1220
## NotSig  28555
## Up        208
## 
## 
## $sq3
## $sq3$Description
## [1] "v1v2 comparison for population: sq3"
## 
## $sq3$Results
##        Vernv2
## Down      195
## NotSig  29605
## Up        183
## 
## 
## $tm2
## $tm2$Description
## [1] "v1v2 comparison for population: tm2"
## 
## $tm2$Results
##        Vernv2
## Down        0
## NotSig  29926
## Up         57
## 
## 
## $wl2
## $wl2$Description
## [1] "v1v2 comparison for population: wl2"
## 
## $wl2$Results
##        Vernv2
## Down      108
## NotSig  29689
## Up        186
## 
## 
## $yo1
## $yo1$Description
## [1] "v1v2 comparison for population: yo1"
## 
## $yo1$Results
##        Vernv2
## Down       87
## NotSig  29465
## Up        431

Full Table

# Load the necessary library
library(dplyr)

# Define the data frames for each comparison
vo_vs_v1_table <- data.frame(
  Population = c("bhr", "ihl", "kc", "lv", "sq", "tm", "wl", "yo"),
  Down = c(298, 203, 282, 203, 637, 1521, 112, 270),
  NotSig = c(29088, 28977, 28978, 28458, 29302, 27924, 29533, 29373),
  Up = c(597, 803, 723, 1322, 44, 538, 338, 340),
  Comparison = 'vo_vs_v1',
  Elevation = c("low","low","low","high","high","low","high","high")
)

vo_vs_v2_table<- data.frame(
  Population = c("bhr", "ihl", "kc", "lv", "sq", "tm", "wl", "yo"),
  Down = c(530, 368, 105, 1081, 707, 35, 223, 126),
  NotSig = c(28489, 28630, 29197, 27785, 29161, 29708, 29277, 29297),
  Up = c(964, 985, 681, 1117, 115, 240, 483, 560),
  Comparison = 'vo_vs_v2',
  Elevation = c("low","low","low","high","high","low","high","high")
)

v1_vs_v2_table <- data.frame(
  Population = c("bhr", "ihl", "kc", "lv", "sq", "tm", "wl", "yo"),
  Down = c(468, 374, 19, 1220, 195, 0, 108, 87),
  NotSig = c(29029, 29304, 29778, 28555, 29605, 29940, 29689, 29465),
  Up = c(486, 305, 186, 208, 183, 43, 186, 431),
  Comparison = 'v1_vs_v2',
  Elevation = c("low","low","low","high","high","low","high","high")
)


combined_full_data_table <- bind_rows(vo_vs_v1_table, vo_vs_v2_table, v1_vs_v2_table)

write.csv(combined_full_data_table, "combined_full_data_table_v0v1v2.csv", row.names = FALSE)


print(combined_full_data_table)
##    Population Down NotSig   Up Comparison Elevation
## 1         bhr  298  29088  597   vo_vs_v1       low
## 2         ihl  203  28977  803   vo_vs_v1       low
## 3          kc  282  28978  723   vo_vs_v1       low
## 4          lv  203  28458 1322   vo_vs_v1      high
## 5          sq  637  29302   44   vo_vs_v1      high
## 6          tm 1521  27924  538   vo_vs_v1       low
## 7          wl  112  29533  338   vo_vs_v1      high
## 8          yo  270  29373  340   vo_vs_v1      high
## 9         bhr  530  28489  964   vo_vs_v2       low
## 10        ihl  368  28630  985   vo_vs_v2       low
## 11         kc  105  29197  681   vo_vs_v2       low
## 12         lv 1081  27785 1117   vo_vs_v2      high
## 13         sq  707  29161  115   vo_vs_v2      high
## 14         tm   35  29708  240   vo_vs_v2       low
## 15         wl  223  29277  483   vo_vs_v2      high
## 16         yo  126  29297  560   vo_vs_v2      high
## 17        bhr  468  29029  486   v1_vs_v2       low
## 18        ihl  374  29304  305   v1_vs_v2       low
## 19         kc   19  29778  186   v1_vs_v2       low
## 20         lv 1220  28555  208   v1_vs_v2      high
## 21         sq  195  29605  183   v1_vs_v2      high
## 22         tm    0  29940   43   v1_vs_v2       low
## 23         wl  108  29689  186   v1_vs_v2      high
## 24         yo   87  29465  431   v1_vs_v2      high

Histogram of the Full

# Load necessary libraries
library(ggplot2)
library(dplyr)


# Read the data
combined.full.histogram.data <- read.csv("combined_full_data_table_v0v1v2.csv")

# Melt the data to long format for easier plotting
combined.full.histogram.plot <- combined.full.histogram.data %>%
  gather(key = "Regulation", value = "Count", -Population, -Comparison,-Elevation)

# Create a new variable for easier labeling in the plot
combined.full.histogram.plot$Population_Comparison <- paste(combined.full.histogram.plot$Population, combined.full.histogram.plot$Comparison, combined.full.histogram.plot$Elevation,sep = "/")

combined.full.histogram.plot$Population_Comparison <- gsub("v0_vs_v1", "v0v1", combined.full.histogram.plot$Population_Comparison)
combined.full.histogram.plot$Population_Comparison <- gsub("v0_vs_v2", "v0v2", combined.full.histogram.plot$Population_Comparison)
combined.full.histogram.plot$Population_Comparison <- gsub("v1_vs_v2", "v1v2", combined.full.histogram.plot$Population_Comparison)

ggplot(combined.full.histogram.plot, aes(x = Population_Comparison, y = Count, fill = Regulation)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.7)) +  
  geom_text(aes(label = Count), vjust = -0.3, position = position_dodge(width = 0.9)) + 
  labs(title = "Gene Regulation Across Populations and Comparisons",
       x = "Population (Comparison)",
       y = "Count of Genes") +
  theme_minimal() +
  scale_fill_brewer(palette = "Dark2") +  # Set a color palette
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for better visibility

Group_1_tm_lv_data <- filter(combined.full.histogram.data, Population %in% c("tm", "lv"))
Group_2_ihl_wl_data <- filter(combined.full.histogram.data, Population %in% c("ihl", "wl"))
Group_3_bhr_yo_data <- filter(combined.full.histogram.data, Population %in% c("bhr", "yo"))
Group_4_kc_sq_data <- filter(combined.full.histogram.data, Population %in% c("kc", "sq"))
# Load necessary libraries
library(ggplot2)
library(dplyr)
library(tidyr)  # for gather function

# Assuming Group_1_tm_lv_data has already been created by filtering combined.full.histogram.data
# Melt the data to long format for easier plotting
group_1_plot <- Group_1_tm_lv_data %>%
  gather(key = "Regulation", value = "Count", -Population, -Comparison, -Elevation)

# Create a new variable for easier labeling in the plot
group_1_plot$Population_Comparison <- paste(group_1_plot$Population, group_1_plot$Comparison, group_1_plot$Elevation, sep = "/")

# Simplify labels if necessary (depending on your specific labeling needs)
group_1_plot$Population_Comparison <- gsub("v0_vs_v1", "v0v1", group_1_plot$Population_Comparison)
group_1_plot$Population_Comparison <- gsub("v0_vs_v2", "v0v2", group_1_plot$Population_Comparison)
group_1_plot$Population_Comparison <- gsub("v1_vs_v2", "v1v2", group_1_plot$Population_Comparison)

# Plotting
ggplot(group_1_plot, aes(x = Population_Comparison, y = Count, fill = Regulation)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.7)) +  
  geom_text(aes(label = Count), vjust = -0.3, position = position_dodge(width = 0.9)) + 
  labs(title = "Gene Regulation in Group 1 (Populations tm and lv)",
       x = "Population (Comparison/Elevation)",
       y = "Count of Genes") +
  theme_minimal() +
  scale_fill_brewer(palette = "Dark2") +  # Set a color palette
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for better visibility

# Load necessary libraries
library(ggplot2)
library(dplyr)
library(tidyr)  # for gather function

# Assuming Group_2_ihl_wl_data has already been created by filtering combined.full.histogram.data
# Melt the data to long format for easier plotting
group_2_plot <- Group_2_ihl_wl_data %>%
  gather(key = "Regulation", value = "Count", -Population, -Comparison, -Elevation)

# Create a new variable for easier labeling in the plot
group_2_plot$Population_Comparison <- paste(group_2_plot$Population, group_2_plot$Comparison, group_2_plot$Elevation, sep = "/")

# Simplify labels if necessary (depending on your specific labeling needs)
group_2_plot$Population_Comparison <- gsub("v0_vs_v1", "v0v1", group_2_plot$Population_Comparison)
group_2_plot$Population_Comparison <- gsub("v0_vs_v2", "v0v2", group_2_plot$Population_Comparison)
group_2_plot$Population_Comparison <- gsub("v1_vs_v2", "v1v2", group_2_plot$Population_Comparison)

# Plotting
ggplot(group_2_plot, aes(x = Population_Comparison, y = Count, fill = Regulation)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.7)) +  
  geom_text(aes(label = Count), vjust = -0.3, position = position_dodge(width = 0.9)) + 
  labs(title = "Gene Regulation in Group 2 (Populations ihl and wl)",
       x = "Population (Comparison/Elevation)",
       y = "Count of Genes") +
  theme_minimal() +
  scale_fill_brewer(palette = "Dark2") +  # Set a color palette
  theme(axis.text.x = element_text(angle = 45, hjust = 1))  # Rotate x-axis labels for better visibility

# Assuming Group_3_bhr_yo_data has already been created by filtering combined.full.histogram.data
# Melt the data to long format for easier plotting
group_3_plot <- Group_3_bhr_yo_data %>%
  gather(key = "Regulation", value = "Count", -Population, -Comparison, -Elevation)

# Create a new variable for easier labeling in the plot
group_3_plot$Population_Comparison <- paste(group_3_plot$Population, group_3_plot$Comparison, group_3_plot$Elevation, sep = "/")

# Simplify labels if necessary
group_3_plot$Population_Comparison <- gsub("v0_vs_v1", "v0v1", group_3_plot$Population_Comparison)
group_3_plot$Population_Comparison <- gsub("v0_vs_v2", "v0v2", group_3_plot$Population_Comparison)
group_3_plot$Population_Comparison <- gsub("v1_vs_v2", "v1v2", group_3_plot$Population_Comparison)

# Plotting
ggplot(group_3_plot, aes(x = Population_Comparison, y = Count, fill = Regulation)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.7)) +  
  geom_text(aes(label = Count), vjust = -0.3, position = position_dodge(width = 0.9)) + 
  labs(title = "Gene Regulation in Group 3 (Populations bhr and yo)",
       x = "Population (Comparison/Elevation)",
       y = "Count of Genes") +
  theme_minimal() +
  scale_fill_brewer(palette = "Dark2") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Assuming Group_4_kc_sq_data has already been created by filtering combined.full.histogram.data
# Melt the data to long format for easier plotting
group_4_plot <- Group_4_kc_sq_data %>%
  gather(key = "Regulation", value = "Count", -Population, -Comparison, -Elevation)

# Create a new variable for easier labeling in the plot
group_4_plot$Population_Comparison <- paste(group_4_plot$Population, group_4_plot$Comparison, group_4_plot$Elevation, sep = "/")

# Simplify labels if necessary
group_4_plot$Population_Comparison <- gsub("v0_vs_v1", "v0v1", group_4_plot$Population_Comparison)
group_4_plot$Population_Comparison <- gsub("v0_vs_v2", "v0v2", group_4_plot$Population_Comparison)
group_4_plot$Population_Comparison <- gsub("v1_vs_v2", "v1v2", group_4_plot$Population_Comparison)

# Plotting
ggplot(group_4_plot, aes(x = Population_Comparison, y = Count, fill = Regulation)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.7)) +  
  geom_text(aes(label = Count), vjust = -0.3, position = position_dodge(width = 0.9)) + 
  labs(title = "Gene Regulation in Group 4 (Populations kc and sq)",
       x = "Population (Comparison/Elevation)",
       y = "Count of Genes") +
  theme_minimal() +
  scale_fill_brewer(palette = "Dark2") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))